import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import math
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as offline
import cufflinks as cf
# Initialise plotly/cufflinks for offline rendering inside the notebook.
# NOTE(review): `py` and `offline` alias the same module (plotly.offline), so
# the three init calls below are redundant repeats — harmless but collapsible.
py.init_notebook_mode(connected=True)
init_notebook_mode(connected=True)
offline.init_notebook_mode()
cf.go_offline()
# Default seaborn palette, reused by later plots.
color = sns.color_palette()
# Load the HELOC dataset and preview the last five records.
heloc = pd.read_excel("heloc_dataset_v1.xlsx")
display(heloc.tail(n=5).transpose())

# Investigate balance for both output classes
n_records = heloc.shape[0]
n_riskperf_good = (heloc['RiskPerformance'] == 'Good').sum()
n_riskperf_bad = (heloc['RiskPerformance'] == 'Bad').sum()
good_percent = 100 * n_riskperf_good / n_records
print("Total number of records: {:0,}".format(n_records))
print("Individuals with good RiskPerformance: {:0,}".format(n_riskperf_good))
print("Individuals with bad RiskPerformance: {:0,}".format(n_riskperf_bad))
print("Percentage of individuals with good RiskPerformance: {:0.2f}%".format(good_percent))

# Class distribution, tabulated and rendered as a pie chart.
temp = heloc["RiskPerformance"].value_counts()
df = pd.DataFrame({'labels': list(temp.index), 'values': list(temp.values)})
print(df)
df.iplot(kind='pie', labels='labels', values='values', title='RiskPerformance')
FICO uses 5 broad categories of information, each assigned a relative importance, to determine the credit score. The features provided in the input dataset have been subjectively mapped to these categories below (relative weight in brackets) in order to develop a better intuition of the input features.
# Summary statistics for every column.
display(heloc.describe().transpose())

# Tally the three special sentinel values in each feature:
# -9 no bureau record / not investigated, -8 no usable/valid trade or
# inquiry, -7 condition not met.
print("{:30s} {:} {:} {:}".format("Feature", "No Record/Invest.", "No Usable/Valid Trade/Inq", "Cond. not met" ))
for col in heloc.columns:
    sentinel_counts = [(heloc[col] == code).sum() for code in (-9, -8, -7)]
    print("{:30s} \t {:0,} \t\t\t {:0,} \t\t\t {:0,}".format(col[:30], *sentinel_counts))
There are 3 special values in the HELOC dataset:
According to the data provider, when a loan applicant’s credit bureau report was either not investigated or not found, all features obtained from the credit bureau report receive a special value of -9. The confounding of no bureau report investigated (most likely a VIP applicant) and no bureau report found (a negative trait for extending credit) is most likely responsible for the two different outcomes.
All such records will be removed from the dataset due to their confounding nature.
# Inspect the applicants flagged -9 (credit bureau report not investigated
# or not found) before removing them.
no_rec = heloc[heloc['MSinceOldestTradeOpen'] == -9]
display(no_rec.describe().transpose())

# Remove 588 entries with no bureau records
clean_norec_heloc = heloc[heloc['MSinceOldestTradeOpen'] != -9].copy()
# Tallies of the special delinquency codes remaining after the -9 purge.
delq12m = clean_norec_heloc['MaxDelq2PublicRecLast12M']
delq_ever = clean_norec_heloc['MaxDelqEver']
unusable12m = delq12m > 7
n_unknown_DelqLast12M = unusable12m.sum()
n_unknown_DelqEver_1 = (delq_ever == 1).sum()
n_unknown_DelqEver_7 = (delq_ever == 7).sum()
n_unknown_DelqEver_9 = (delq_ever == 9).sum()
print("Individuals with unusable delinquency data over last 12 months: {:0,}".format(n_unknown_DelqLast12M))
print("Individuals with no delinquency data: {:0,}".format(n_unknown_DelqEver_1))
print("Individuals with unknown delinquency: {:0,}".format(n_unknown_DelqEver_7))
print("Individuals with unusable delinquency data: {:0,}".format(n_unknown_DelqEver_9))
display(clean_norec_heloc[unusable12m].transpose())

# Drop the rows whose 12-month delinquency code is unusable (> 7).
clean_nodelq_heloc = clean_norec_heloc[~unusable12m].copy()
display(clean_nodelq_heloc.describe().transpose())
Based on the correlation heatmap below, the following features exhibit strong correlation:
# Correlation heatmap across the numeric features; iloc[:, 1:] skips the
# first column (presumably the RiskPerformance label — confirm column order).
from seaborn import heatmap
# Pretty display for notebooks
%matplotlib inline
plt.figure(figsize=(25, 25))
heatmap(clean_nodelq_heloc.iloc[:, 1:].corr(),annot= True, square= True, cmap="RdYlGn")
# Convert the percentage-based trade features into absolute trade counts
# (rounded to the nearest integer) using the total number of trades.
_pct_to_count = {
    'NumTradesNeverDelq': 'PercentTradesNeverDelq',
    'NumInstallTrades': 'PercentInstallTrades',
    'NumTradesWBalance': 'PercentTradesWBalance',
}
for count_col, pct_col in _pct_to_count.items():
    clean_nodelq_heloc[count_col] = np.rint(
        clean_nodelq_heloc[pct_col] * clean_nodelq_heloc['NumTotalTrades'] / 100)
display(clean_nodelq_heloc.head().transpose())
The features MaxDelq2PublicRecLast12M and MaxDelqEver will have their values grouped in the following categories:
These categories will be one-hot encoded to add them as distinct features within the dataset.
# Coarse label buckets for the MaxDelq2PublicRecLast12M codes:
# 1-4 delinquent, 5/6/8/9 invalid, 7 not delinquent, 0 derogatory.
delq12M_map = {
    **dict.fromkeys((1, 2, 3, 4), 'DelinquentLast12M'),
    **dict.fromkeys((5, 6, 8, 9), 'InvalidDelqLast12M'),
    7: 'NotDelinquentLast12M',
    0: 'DerogatoryLast12M',
}

# Coarse label buckets for the MaxDelqEver codes:
# 1/7/9 invalid, 2 derogatory, 3-6 delinquent, 8 never delinquent.
delqEver_map = {
    **dict.fromkeys((1, 7, 9), 'InvalidDelq'),
    2: 'Derogatory',
    **dict.fromkeys((3, 4, 5, 6), 'Delinquent'),
    8: 'NeverDelinquent',
}
# Attach the categorical labels, then one-hot encode them.
for src_col, label_col, mapping in (
    ('MaxDelq2PublicRecLast12M', 'MaxDelq2PublicRecLast12MLabel', delq12M_map),
    ('MaxDelqEver', 'MaxDelqEverLabel', delqEver_map),
):
    clean_nodelq_heloc[label_col] = clean_nodelq_heloc[src_col].map(mapping)
full_features = pd.get_dummies(clean_nodelq_heloc, columns=["MaxDelq2PublicRecLast12MLabel", "MaxDelqEverLabel"])
print("{} total features after one-hot encoding. \n".format(len(full_features.columns)))
display(full_features.head().transpose())

# Correlation heatmap including the new one-hot columns.
plt.figure(figsize=(25, 25))
heatmap(full_features.iloc[:, 1:].corr(), annot=True, square=True, cmap="RdYlGn")
The feature pairs exhibiting absolute correlation greater than 0.80 (in descending order) are listed below:
In addition, the following features will be dropped as they have already been used to derive other features:
# Features to discard: one member of each highly-correlated pair, plus the
# raw columns already consumed when deriving counts / one-hot labels.
features_drop = ['NumInqLast6M', 'NumTotalTrades', 'MaxDelq2PublicRecLast12MLabel_NotDelinquentLast12M',
                 'NumTrades90Ever2DerogPubRec', 'NumTradesNeverDelq', 'MaxDelqEverLabel_Delinquent',
                 'PercentTradesNeverDelq', 'PercentTradesWBalance', 'PercentInstallTrades',
                 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'MaxDelq2PublicRecLast12MLabel_InvalidDelqLast12M',
                 'MaxDelqEverLabel_InvalidDelq']
reduced_features = full_features.drop(features_drop, axis=1)
print("{} Number of features after reduction \n".format(len(reduced_features.columns)))
display(reduced_features.head().transpose())

# Correlation heatmap of the reduced feature set.
plt.figure(figsize=(25, 25))
heatmap(reduced_features.iloc[:, 1:].corr(), annot=True, square=True, cmap="RdYlGn", linewidths=.05)
#Produce KDEplot for a specific feature - visualise distribution of continuous numeric feature
def kdeplt(feature, rnd=-1, data=None):
    """Plot class-conditional KDEs of a continuous feature.

    Parameters
    ----------
    feature : str
        Column of `data` to plot.
    rnd : int, optional
        Decimal position passed to round() for the upper x-limit
        (negative values round to tens/hundreds/...); the lower
        limit is always rounded to the nearest ten.
    data : pandas.DataFrame, optional
        Frame containing `feature` and a 'RiskPerformance' column.
        Defaults to the module-level `reduced_features`, preserving
        the behaviour of existing call sites.
    """
    if data is None:
        data = reduced_features
    xlow = round(data[feature].min(), -1)
    xhigh = round(data[feature].max(), rnd)
    print('Low boundary : ', xlow, 'High boundary : ', xhigh)
    plt.figure(figsize=(7, 4))
    # One shaded density per outcome class, overlaid on the same axes.
    ax = sns.kdeplot(data[feature][data['RiskPerformance'] == 'Good'],
                     color="darkturquoise", shade=True)
    sns.kdeplot(data[feature][data['RiskPerformance'] == 'Bad'],
                color="lightcoral", shade=True)
    plt.legend(['Good', 'Bad'])
    plt.title('Loan types by ' + feature)
    ax.set(xlabel=feature)
    plt.xlim(xlow, xhigh)
    plt.show()
#Multi-bar Plot - discrete categorical attributes
def barplot(feature, data=None):
    """Grouped count plot of a categorical feature, split by RiskPerformance.

    data defaults to the module-level `reduced_features`, preserving the
    behaviour of existing call sites.
    """
    if data is None:
        data = reduced_features
    plt.figure(figsize=(4, 4))
    sns.countplot(x=feature, hue="RiskPerformance", data=data)
# Using multiple Histograms - visualising numeric and categorical attributes together
def multhist(feature, data=None):
    """Overlaid per-class histograms of `feature` (Good vs Bad).

    data defaults to the module-level `reduced_features`, preserving the
    behaviour of existing call sites.
    """
    if data is None:
        data = reduced_features
    fig = plt.figure(figsize=(6, 4))
    fig.suptitle('Loan types by ' + feature)
    ax = fig.add_subplot(1, 1, 1)
    # BUG FIX: the x-label was hard-coded to "MSinceOldestTradeOpen";
    # label the axis with the feature actually plotted.
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")
    g = sns.FacetGrid(data, hue='RiskPerformance')
    g.map(sns.distplot, feature, kde=False, bins=20, ax=ax)
    ax.legend(title='RiskPerformance')
    # Close the extra figure FacetGrid creates so only the combined axes show.
    plt.close(2)
# Per-feature exploration: compute the summary statistic used in the original
# notebook cells, then draw the plot suited to the feature's type (KDE for
# continuous values, histograms for discrete counts, bar plots for one-hot
# indicator columns).
_eda_plan = [
    ('ExternalRiskEstimate', 'describe', lambda f: kdeplt(f, -2)),
    ('MSinceOldestTradeOpen', 'describe', lambda f: kdeplt(f, -3)),
    ('MSinceMostRecentTradeOpen', 'describe', multhist),
    ('AverageMInFile', 'describe', kdeplt),
    ('NumSatisfactoryTrades', 'describe', kdeplt),
    ('NumTrades60Ever2DerogPubRec', 'describe', multhist),
    ('MSinceMostRecentDelq', 'describe', multhist),
    ('NumTradesOpeninLast12M', 'describe', kdeplt),
    ('MSinceMostRecentInqexcl7days', 'describe', multhist),
    ('NumInqLast6Mexcl7days', 'describe', multhist),
    ('NetFractionRevolvingBurden', 'describe', kdeplt),
    ('NetFractionInstallBurden', 'describe', kdeplt),
    ('NumRevolvingTradesWBalance', 'describe', kdeplt),
    ('NumInstallTradesWBalance', 'describe', multhist),
    ('NumBank2NatlTradesWHighUtilization', 'describe', multhist),
    ('NumInstallTrades', 'describe', multhist),
    ('NumTradesWBalance', 'describe', multhist),
    ('MaxDelq2PublicRecLast12MLabel_DelinquentLast12M', 'value_counts', barplot),
    ('MaxDelq2PublicRecLast12MLabel_DerogatoryLast12M', 'value_counts', barplot),
    ('MaxDelqEverLabel_Derogatory', 'value_counts', barplot),
    ('MaxDelqEverLabel_NeverDelinquent', 'value_counts', barplot),
]
for _feat, _stat, _plot in _eda_plan:
    # Result intentionally discarded, mirroring the original expression statements.
    getattr(reduced_features[_feat], _stat)()
    _plot(_feat)
The first 2 principal components explain almost 85% of the variance in the data. The key features of the first three components are listed below:
from sklearn.decomposition import PCA
import visuals as vs

# Fit a 3-component PCA on the numeric features (first column, the
# RiskPerformance label, excluded) and visualise the explained variance.
numeric_part = reduced_features.iloc[:, 1:]
pca = PCA(n_components=3).fit(numeric_part)
# Generate PCA results plot
pca_results = vs.pca_results(numeric_part, pca, 24, 20)
The maximum score of 71.6% is obtained when 18 features are used. From a dimensionality reduction perspective, 13 features provide a reasonable tradeoff between score (71.5%) and number of features. The 8 selected features can be grouped into 4 categories:
When ExternalRiskEstimate is dropped from the list of features and the recursive feature selection is rerun, the following 13 features result in a score of 71.5%:
# Split the cleaned frame into label vector and feature matrix,
# encoding RiskPerformance as Good -> 0, Bad -> 1.
reduced_features = reduced_features.reset_index(drop=True)
riskperf_raw = reduced_features['RiskPerformance']
features = reduced_features.drop(columns='RiskPerformance')
riskperf = riskperf_raw.map({'Good': 0, 'Bad': 1})
display(riskperf.head(n=5))
from gmean_score import Gscore
import gmean_score as gm


def scre(y_true, y_pred):
    """Return the geometric-mean (G-mean) score of predictions vs. truth."""
    return Gscore(y_true, y_pred).g_mean()
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Feature matrix without the opaque ExternalRiskEstimate score.
features1 = features.drop(['ExternalRiskEstimate'], axis=1)

# Recursive feature elimination with a logistic-regression estimator,
# scored for every target feature count from 1 to 22.
scr = []
model = LogisticRegression()
for n_feats in range(1, 23):
    print("Number of features: ", n_feats)
    rfe = RFE(model, n_feats).fit(features1, riskperf)
    # summarize the selection of the attributes
    print('Selected features: %s' % list(features1.columns[rfe.support_]))
    print('Score: %f' % rfe.score(features1, riskperf))
    scr.append(scre(riskperf, rfe.predict(features1)))
    print('G-mean: %f' % scr[-1])

# Plot number of features VS. G_mean scores
plt.figure(figsize=(10, 6))
plt.xticks(np.arange(0, 25, 1))
plt.xlabel("Number of features selected")
plt.ylabel("G_Mean score")
plt.plot(range(1, len(scr) + 1), scr)
plt.show()
Even with cross-validation of the dataset, the results are very similar to the recursive feature elimination (without cv). The highest score is obtained when using 18 features.
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer

# Cross-validated recursive feature elimination, using the G-mean
# as the selection criterion over 10 folds.
rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=10,
              scoring=make_scorer(scre))
rfecv.fit(features1, riskperf)
print("Optimal number of features: %d" % rfecv.n_features_)
print('Selected features: %s' % list(features1.columns[rfecv.support_]))

# Plot number of features VS. cross-validation scores
plt.figure(figsize=(10, 6))
plt.xticks(np.arange(0, 25, 1))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation Gmean score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
Using a decision tree classifier, an almost perfect G-mean score (99.5%) is achieved for a tree depth of 25 levels, both with and without the feature ExternalRiskEstimate. While the large tree depth is not ideal for interpretability, it was still a useful exercise to identify the most important features resulting in the very high predictive power. The most important features, excluding ExternalRiskEstimate due to its lack of transparency, are grouped in the following categories:
from sklearn.tree import DecisionTreeClassifier

# Fit decision trees of increasing depth on the full feature set and report
# the training G-mean plus the dominant features (importance > 5%).
for depth in range(1, 26, 4):
    print('\nTree depth: ', depth)
    classifier = DecisionTreeClassifier(max_depth=depth).fit(features, riskperf)
    print('G-mean score is: ', scre(riskperf, classifier.predict(features)))
    print('Feature importance:')
    for name, importance in zip(features.columns, classifier.feature_importances_):
        if importance > 0.05:
            print(name, importance)
ExternalRiskEstimate is the most important feature for the varying tree depths. However, since it is not very transparent how this feature is calculated by other credit bureaus, the decision tree classifier will be rerun excluding this feature to assess the impact on the Gmean score.
# Repeat the depth sweep without ExternalRiskEstimate to gauge its impact.
for depth in range(1, 26, 4):
    print('\nTree depth: ', depth)
    classifier1 = DecisionTreeClassifier(max_depth=depth).fit(features1, riskperf)
    print('G-mean score is: ', scre(riskperf, classifier1.predict(features1)))
    print('Feature importance:')
    for name, importance in zip(features1.columns, classifier1.feature_importances_):
        if importance > 0.05:
            print(name, importance)
Using PCA, RFE and tree feature selection, the following 9 features consistently came up as important, and 8 of them (all except ExternalRiskEstimate) were included in the final list of features for model training.
# Persist the final modelling dataset: the label column plus the eight
# features selected by the PCA / RFE / tree analyses.
final_features = [
    'RiskPerformance',
    'MSinceOldestTradeOpen',
    'AverageMInFile',
    'NetFractionRevolvingBurden',
    'NetFractionInstallBurden',
    'NumBank2NatlTradesWHighUtilization',
    'NumSatisfactoryTrades',
    'NumTrades60Ever2DerogPubRec',
    'MSinceMostRecentInqexcl7days',
]
heloc_final = reduced_features[final_features]
display(heloc_final.head())
heloc_final.to_excel('heloc_featureset_final.xlsx', index=False)